# Clear the environment: remove every object that ls() reports in the
# environment this script is evaluated in (names starting with "." are not
# listed by ls() and therefore survive).
# NOTE(review): when this file is source()d from an interactive session this
# wipes the caller's workspace as well; it does not unload packages or reset
# options. Running the script in a fresh R session is the safer alternative.
rm(list = ls())

library(readr)
library(readxl)
library(reshape2)
library(dplyr)
library(ggplot2)
library(tidyr)
library(abind)
library(zoo)
library(stringr)
library(tidylog)
library(stargazer)
library(haven) #for importing Stata .dta files

# Set directories.
# Presumably defines DATA.SOLAR.SITING and DATA.PROCESSED, which are used
# below -- confirm in set_directories.R.
source("./Code/set_directories.R")

# Import the two raw Stata (.dta) files from the solar-siting data folder.
ttsxi_data <- file.path(DATA.SOLAR.SITING, "TTSXI_EOY2017.dta") %>%
  read_dta()
openpv_data <- file.path(DATA.SOLAR.SITING, "openpv_all_2018-07-22.dta") %>%
  read_dta()

# Clean ttsxi_data: keep "RES" customer-segment records with an installation
# year of 2016 or later, reduced to state, zero-padded zip and system size
# (systemsize is renamed to size_kw so it lines up with openpv_data_clean).
ttsxi_data_clean <- ttsxi_data %>%
  mutate(
    # Zero-pad the zip code to five characters. sprintf() formats a missing
    # zipcode as the literal text "   NA", which would later be treated as a
    # real zip code, so keep missing values as a genuine NA instead.
    zip = if_else(is.na(zipcode), NA_character_, sprintf("%05d", zipcode)),
    # Year = last four characters of installationdate.
    # Assumes the date text ends in a 4-digit year -- TODO confirm.
    year = as.numeric(
      substr(
        installationdate,
        nchar(installationdate) - 3,
        nchar(installationdate)
      )
    )
  ) %>%
  # Rows whose year could not be parsed (NA) are dropped here as well.
  filter(year >= 2016, customersegment == "RES") %>%
  select(state, zip, size_kw = systemsize)

# Clean openpv_data: keep residential records with an installation year
# before 2016, reduced to state, zero-padded zip and size_kw.
openpv_data_clean <- openpv_data %>%
  mutate(
    # Zero-pad the zip code to five characters. sprintf() formats a missing
    # zipcode as the literal text "   NA", which would later be treated as a
    # real zip code, so keep missing values as a genuine NA instead.
    zip = if_else(is.na(zipcode), NA_character_, sprintf("%05d", zipcode)),
    # Replace the year column shipped with the data by one derived from the
    # last four characters of date_installed.
    # Assumes the date text ends in a 4-digit year -- TODO confirm.
    year = as.numeric(
      substr(
        date_installed,
        nchar(date_installed) - 3,
        nchar(date_installed)
      )
    )
  ) %>%
  # Rows whose year could not be parsed (NA) are dropped here as well.
  # install_type appears with both capitalizations, so accept either.
  filter(year < 2016, install_type %in% c("residential", "Residential")) %>%
  select(state, zip, size_kw)

# Stack the two cleaned sources and aggregate to one row per zip code.
systems_data_clean <- openpv_data_clean %>%
  bind_rows(ttsxi_data_clean) %>%
  # zip is a character column (built with sprintf), so compare it with the
  # sentinel as text rather than relying on implicit numeric -> character
  # coercion. -9999 is presumably the source's missing-value code -- confirm.
  # Rows with a missing zip are dropped explicitly.
  filter(!is.na(zip), zip != "-9999") %>%
  group_by(zip) %>%
  summarize(
    # A zip may appear with more than one state value; the first one wins.
    state = first(state),
    # total_size must be computed before size_kw is overwritten by its mean.
    # Neither aggregate removes NAs, so one missing size makes both NA.
    total_size = sum(size_kw),
    size_kw = mean(size_kw),
    count = n()
  ) %>%
  # Number of systems if every system had size 4 (same unit as size_kw).
  # NOTE(review): 4 is presumably a nominal residential system size -- confirm.
  mutate(count_equiv = round(total_size / 4))

# Save the zip-level table for downstream scripts.
f_systems_data_clean <- file.path(DATA.PROCESSED, "systems_data_clean.rds")
write_rds(systems_data_clean, f_systems_data_clean)
